Exogenous RNA Coverage Plots

Author

Lance Parsons

Load libraries

This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().

install.packages("renv")
renv::restore()

Read sample data

Get list of samples

Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE) %>%
  mutate(
    day = as.factor(paste0("day", day)),
    batch = as.factor(case_when(
      exogenous_rna == "mastermix1" ~ "batch3",
      exogenous_rna == "mastermix2" ~ "batch3",
      TRUE ~ "batch2"
    )),
    .before = cell_line,
  )

sample_units

Table of exogenous RNA mixtures

Code
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103_mDNMT1", "PJY300_mDNMT1")) {
  t <- Biostrings::readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}

rna_mixes <- rna_mixes %>%
  mutate(
    active_cis_start_max = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 16,
      exogenous_rna == "PJY300_mDNMT1" ~ 16,
      exogenous_rna == "mastermix1" ~ 15,
      exogenous_rna == "mastermix2" ~ 15
    ),
    active_cis_end_min = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 74,
      exogenous_rna == "PJY300_mDNMT1" ~ 74,
      exogenous_rna == "mastermix1" ~ 73,
      exogenous_rna == "mastermix2" ~ 73
    ),
    inactive_ct_end_max = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 38,
      exogenous_rna == "PJY300_mDNMT1" ~ 38,
      exogenous_rna == "mastermix1" ~ 37,
      exogenous_rna == "mastermix2" ~ 37
    ),
    active_trans_start_max = case_when(
      rna_species == "PJY103_mDNMT1" ~ 115,
      rna_species == "PJY300_mDNMT1" ~ 115,
      rna_species == "PJY179_FANCF" ~ 119,
      rna_species == "PJY181_HEK3" ~ 120,
      rna_species == "PJY182_HEK3" ~ 120,
      rna_species == "PJY183_DNMT1" ~ 113,
      rna_species == "PJY184_DNMT1" ~ 113,
      rna_species == "PJY186_RUNX1" ~ 117,
      rna_species == "PJY185_RUNX1" ~ 117,
      rna_species == "PJY187_VEGFA" ~ 124,
      rna_species == "PJY306_EMX1" ~ 118,
      rna_species == "PJY302_EMX1" ~ 118,
      rna_species == "PJY177_RNF2" ~ 120
    ),
    active_trans_end_min = case_when(
      rna_species == "PJY103_mDNMT1" ~ 121,
      rna_species == "PJY300_mDNMT1" ~ 121,
      rna_species == "PJY179_FANCF" ~ 124,
      rna_species == "PJY181_HEK3" ~ 121,
      rna_species == "PJY182_HEK3" ~ 121,
      rna_species == "PJY183_DNMT1" ~ 118,
      rna_species == "PJY184_DNMT1" ~ 118,
      rna_species == "PJY186_RUNX1" ~ 122,
      rna_species == "PJY185_RUNX1" ~ 122,
      rna_species == "PJY187_VEGFA" ~ 129,
      rna_species == "PJY306_EMX1" ~ 123,
      rna_species == "PJY302_EMX1" ~ 123,
      rna_species == "PJY177_RNF2" ~ 121
    ),
  )

rna_mixes

Exogenous RNA counts

BAM reading functions

Code
rna_species_plot_range <- function(sequence_name) {
  plot_range <- rna_mixes %>%
    dplyr::filter(.data$rna_species == sequence_name) %>%
    dplyr::select(start, end) %>%
    unique()
  return(plot_range)
}

# GRanges from BAM
granges_from_bam <- function(sample_unit,
                             sequence_name,
                             is_proper_pair = TRUE,
                             bam_dir =
                               "results/alignments/exogenous_rna/sorted") {
  plot_range <- rna_species_plot_range(sequence_name)
  which <- GRanges(
    sprintf("%s:%i-%i", sequence_name, plot_range$start, plot_range$end)
  )

  param <- ScanBamParam(
    flag = scanBamFlag(isProperPair = is_proper_pair),
    mapqFilter = 1,
    which = which
  )

  aligned_fragments_list <- list()

  if (is_proper_pair) {
    aligned_fragments <- granges(
      readGAlignmentPairs(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      ),
      on.discordant.seqnames = "drop"
    )
    rna_info <- rna_mixes %>%
      dplyr::filter(rna_species == {{ sequence_name }}) %>%
      dplyr::select(-"exogenous_rna") %>%
      unique()

    # Active in cis
    aligned_fragments_list$active_cis <- aligned_fragments %>%
      plyranges::filter(
        start <= rna_info$active_cis_start_max,
        end >= rna_info$active_cis_end_min
      )

    # Active in trans
    aligned_fragments_list$active_trans <- aligned_fragments %>%
      plyranges::filter(
        start > rna_info$active_cis_start_max,
        start <= rna_info$active_trans_start_max,
        end >= rna_info$active_trans_end_min
      )

    # Inactive Cryptic Termination
    aligned_fragments_list$inactive_cryptic_terminiation <-
      aligned_fragments %>%
      plyranges::filter(end < rna_info$inactive_ct_end_max)

    # Inactive Other
    aligned_fragments_list$inactive_other <- plyranges::bind_ranges(
      aligned_fragments %>%
        plyranges::filter(
          start <= rna_info$active_cis_start_max,
          end < rna_info$active_cis_end_min,
          end > rna_info$inactive_ct_end_max
        ),
      aligned_fragments %>%
        plyranges::filter(
          start > rna_info$active_cis_start_max,
          (
            start > rna_info$active_trans_start_max |
              end < rna_info$active_trans_end_min
          )
        ),
    )
  } else {
    aligned_fragments <- granges(
      readGAlignments(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      )
    )
    aligned_fragments_list$discordant <- aligned_fragments
  }
  return(aligned_fragments_list)
}

Read BAM coverage

Code
rna_species_plot_data <- list()
for (rna_species_to_plot in rna_mixes$rna_species) {
  rna_info <- rna_mixes %>%
    dplyr::filter(rna_species == rna_species_to_plot)

  sample_units_to_plot <- sample_units %>%
    dplyr::filter(exogenous_rna %in% rna_info$exogenous_rna)

  granges_to_plot <- list()
  for (sample_unit in sample_units_to_plot$sample_unit) {
    granges_to_plot[[sample_unit]] <- granges_from_bam(
      sample_unit,
      rna_species_to_plot,
      TRUE
    )
  }
  rna_species_plot_data[[rna_species_to_plot]] <- granges_to_plot
}

Organize coverage data

Code
exogenous_rna_count_data <- tibble(
  sample_unit = character(),
  sequence_name = character(),
  category = character(),
  count = numeric()
)
for (rna_species in names(rna_species_plot_data)) {
  for (sample_unit in names(rna_species_plot_data[[rna_species]])) {
    for (category in
      names(rna_species_plot_data[[rna_species]][[sample_unit]])) {
      exogenous_rna_count_data <- exogenous_rna_count_data %>%
        add_row(
          sample_unit = sample_unit,
          sequence_name = rna_species,
          category = category,
          count = length(
            rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
          )
        )
    }
  }
}
exogenous_rna_count_data

Summarize coverage data

Code
exogenous_rna_mapped_totals <- exogenous_rna_count_data %>%
  group_by(sample_unit, sequence_name) %>%
  summarize(mapped_fragments = sum(count))
`summarise()` has grouped output by 'sample_unit'. You can override using the
`.groups` argument.
Code
exogenous_rna_mapped_totals

Human small RNA gene counts

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
  • exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
human_gene_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_gene_count.txt"
)

human_gene_counts <- readr::read_tsv(
  human_gene_counts_files[1],
  comment = "#",
  col_names = c("gene", human_gene_counts_files[1]),
  col_types = "ci"
)

for (i in 2:length(human_gene_counts_files)) {
  gene_sample <-
    readr::read_tsv(
      human_gene_counts_files[i],
      comment = "#",
      col_names = c("gene", human_gene_counts_files[i]),
      col_types = "ci"
    )
  human_gene_counts <- human_gene_counts %>%
    dplyr::full_join(gene_sample, by = "gene")
}

human_gene_counts <- human_gene_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ str_replace_all(., "_first_proper_pair_gene_count.txt", ""))

exogenous_gene_counts <- exogenous_rna_count_data %>%
  tidyr::unite(gene, c(sequence_name, category), sep = ":") %>%
  tidyr::spread(sample_unit, count)

gene_counts <- rbind(human_gene_counts, exogenous_gene_counts)

gene_counts
Code
normalization_text_options <- list(
  "human_small_rna" = "Total Human Small RNA",
  "exogenous_rna" = "Total Exogeouns RNA",
  "exogenous_rna_category" = "Exegenous RNA Activity Category"
)
normalization_text <- normalization_text_options[[params$normalization]]

Coverage plots: Normalized by Total Human Small RNA

Code
normalization_factor <- function(sample_unit, rna_species, category) {
  norm_factor <- NA
  if (params$normalization == "human_small_rna") {
    # Total number of human small rna fragments
    norm_factor <- gene_counts %>%
      pivot_longer(
        cols = -"gene",
        names_to = "sample_unit",
        values_to = "count"
      ) %>%
      group_by(.data$sample_unit) %>%
      summarize(count = sum(.data$count, na.rm = TRUE)) %>%
      dplyr::filter(.data$sample_unit == !!sample_unit) %>%
      pull(count)
  } else if (params$normalization == "exogenous_rna") {
    # Total number of exogenous rna fragments
    norm_factor <- exogenous_rna_mapped_totals %>%
      dplyr::filter(
        .data$sample_unit == !!sample_unit,
        .data$sequence_name == !!rna_species
      ) %>%
      dplyr::pull("mapped_fragments")
  } else if (params$normalization == "exogenous_rna_category") {
    # Number of exogenous rna fragments in a given category
    norm_factor <- exogenous_rna_count_data %>%
      dplyr::filter(
        sample_unit == !!sample_unit,
        sequence_name == !!rna_species,
        category == !!category
      ) %>%
      pull(count)
  } else {
    # Invalid normalization parameter
    stop(sprintf("Invalid normalization parameter %s", params$normalization))
  }
  return(norm_factor)
}
Code
# Line colors and types for each series
plot_styles <- list(
  "Parental" = list( # nolint: object_name_linter
    "active_cis" = list("color" = "#800000", "lty" = "solid"),
    "active_trans" = list("color" = "#a84d3b", "lty" = "solid"),
    "inactive_cryptic_terminiation" = list(
      "color" = "#cb8777",
      "lty" = "dotted"
    ),
    "inactive_other" = list("color" = "#e8c2b9", "lty" = "dotted")
  ),
  "P1E10" = list( # nolint: object_name_linter
    "active_cis" = list("color" = "#003869", "lty" = "solid"),
    "active_trans" = list("color" = "#4f648d", "lty" = "solid"),
    "inactive_cryptic_terminiation" = list(
      "color" = "#8995b2",
      "lty" = "dotted"
    ),
    "inactive_other" = list("color" = "#c3c8d8", "lty" = "dotted")
  )
)

# Loop over species and plot coverage
for (rna_species in rna_mixes %>%
  pull(rna_species) %>%
  unique()) {
  gtrack <- GenomeAxisTrack()

  # Read FASTA sequence and trim off description (after space)
  dna <- readDNAStringSet(
    sprintf("data/references/exogenous-rna/%s.fa", rna_species)
  )
  names(dna) <- names(dna) %>% str_remove(" +.*$")
  sequence_track <- SequenceTrack(dna,
    genome = rna_species,
    chromosome = rna_species,
    cex = 0.5
  )

  # Get list of sample units we will plot
  sample_units_to_plot <- sample_units %>%
    dplyr::filter(sample_unit %in% names(rna_species_plot_data[[rna_species]]))

  # Loop over days to create two overlay tracks
  day_tracks <- list()
  day_ylims <- list()
  days_to_plot <- sample_units_to_plot %>%
    pull(day) %>%
    levels()
  for (day in days_to_plot) {
    # Get sample units for day
    sample_units_for_day <- sample_units_to_plot %>%
      filter(day == {{ day }}) %>%
      pull(sample_unit)

    # Loop over sample units for this day
    data_tracks <- list()
    for (sample_unit in sample_units_for_day) {
      cell_line <- sample_units_to_plot %>%
        filter(sample_unit == {{ sample_unit }}) %>%
        pull(cell_line) %>%
        toString()
      for (category in
        names(rna_species_plot_data[[rna_species]][[sample_unit]])) {
        # Calculate coverage
        data <- as(
          coverage(
            rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
          ),
          "GRanges"
        ) %>%
          # Remove non-matching sequences
          keepSeqlevels(rna_species, pruning.mode = "coarse")

        # Normalize scores
        norm_factor <- normalization_factor(sample_unit, rna_species, category)
        score(data) <- score(data) / norm_factor

        # Create GViz data track
        data_track <- DataTrack(data,
          name = day,
          type = "l",
          col = plot_styles[[cell_line]][[category]]$color,
          lty = plot_styles[[cell_line]][[{{ category }}]]$lty,
          strand = "*"
        )
        data_tracks <- c(data_tracks, data_track)
      }
    } # end sample_unit loop
    day_tracks[[day]] <- OverlayTrack(trackList = data_tracks, name = day)
    day_ylims[[day]] <- ylims <- extendrange(range(lapply(data_tracks, values)))
  } # end day loop

  start <- rna_mixes %>%
    filter(rna_species == {{ rna_species }}) %>%
    pull(start) %>%
    unique()
  end <- rna_mixes %>%
    filter(rna_species == {{ rna_species }}) %>%
    pull(end) %>%
    unique()

  annotations <- read_tsv(
    sprintf("data/references/exogenous-rna/%s.bed", rna_species),
    col_names = c("chrom", "chromStart", "chromEnd", "name"),
    col_types = "ciic"
  )

  annotations_positions <- annotations %>%
    dplyr::filter(name %in% c(
      "cryptic_terminator_end", "sgRNA_end", "edit",
      "nick", "pegRNA_end", "terminator_end"
    ))

  annotation_track <- AnnotationTrack(
    chromosome = rna_species,
    start = annotations$chromStart + 1,
    end = annotations$chromEnd,
    id = annotations$name,
    fill = "#cfafc3",
    featureAnnotation = "id",
    fontcolor.feature = "#666666",
    name = "(e)pegRNA features"
  )

  ht <- HighlightTrack(
    trackList = c(sequence_track, annotation_track, day_tracks),
    start = annotations_positions$chromStart + 1,
    end = annotations_positions$chromEnd,
    chromosome = rna_species,
    col = "darkgrey",
    fill = "#FFFFFF00",
    lwd = 0.5
  )

  plotTracks(c(gtrack, ht),
    chromosome = rna_species,
    from = start,
    to = end,
    ylim = c(0, max(unlist(day_ylims))),
    main = rna_species,
    lwd = 3
  )
}